---
title: "Data Cleaning"
output: 
---

# Data Cleaning

# Who's to Blame? Postconflict Violence and Public Attitudes Towards Peace Agreements
# Wyer, Frank. 

#clear environment
```{r clear environment}
rm(list = ls())
```

# uncomment and set working directory to replication archive
# setwd("~/blame_replication")

# Uncomment to install packages if necessary
# install.packages("tidyverse")
# install.packages("readxl")
# install.packages("stringi")
# install.packages("fuzzyjoin")


#load packages
```{r}
library(tidyverse)
library(readxl)
library(stringi)
library(fuzzyjoin)
```


#delete (read in data from my workspace)
setwd("C:/Users/frank.wyer/Dropbox/blame_replication")

#read in data
```{r}
survey_raw <- read_xlsx("Survey Files/survey_raw.xlsx") #raw survey data

survey_municipalities <- read_xlsx("Raw Data Files/survey_municipality_codes.xlsx") #match survey municipalities to municipality codes

survey_regions <- read_xlsx("Raw Data Files/survey_regions.xlsx") #match survey municipalities to regions

homrate_21_22 <- read.csv("Raw Data Files/homrate_21_22.csv") #municipal homicide rate for 2021-2022

hom_rate_change <- read.csv("Raw Data Files/hom_rate_change.csv") #change in municipal homicide rate between five years before and five years after peace agreement

farc_presence <- read.csv("Raw Data Files/indepaz_farc_presence_2012.csv") #municipality-level data from Indepaz on FARC presence in 2012

```

## Merge survey data to observational data using municipality names and codes

```{r merge observational data to survey municipalities}
survey_municipalities <- left_join(survey_municipalities, farc_presence %>% dplyr::select(municode, farc_presence), by = "municode")

survey_municipalities <- stringdist_left_join(survey_municipalities, survey_regions, by = "muniname", ignore_case = TRUE, max_dist = 1, method = "lcs")

survey_municipalities <- left_join(survey_municipalities, homrate_21_22 %>% dplyr::select(municode, homrate100k), by = "municode")

survey_municipalities <- left_join(survey_municipalities, hom_rate_change %>% dplyr::select(municode, homratediff), by = "municode")

survey_raw <- left_join(survey_raw, survey_municipalities %>% mutate(Q3 = as.numeric(Q3)), by = "Q3")

```

# Create Index Variables

##Engagement Index 
```{r dummy for voted in last election}
survey_raw <- survey_raw %>% mutate(vote_dummy = ifelse(Q28 == 1, 1, 0))
```

```{r dummy for VP name question}
survey_raw <- survey_raw %>% 
mutate(vpname_clean = stri_trans_general(tolower(Q29), "Latin-ASCII"), vpname_dummy = ifelse(str_detect(vpname_clean, ("martha|marta|lucia|ramirez|francia|elena|marquez|mina")), 1, 0)) # 1 if they mention the first or last names of the VP or the leading VP candidate at the time
```

```{r create news consumption variable}
na_convert <- function(x) ifelse(is.na(x), 0, x) #helper function converting NA to zero

minusone <- function(x) x - 1 #helper function to fix issue with news scale

#the data has a range of 2-8 when it should be 1-7 and has NAs for zero times per week consuming news

survey_raw <- survey_raw %>% mutate(across(Q30R1:Q30R7, ~minusone(.))) #fixes range
survey_raw <- survey_raw %>% mutate(across(Q30R1:Q30R7, ~na_convert(.))) #converts NA to zero which is appropriate here

survey_raw <- survey_raw %>% rowwise() %>% mutate(tot_news=sum(c_across(Q30R1:Q30R7))) ##add up total news consumption
```

```{r zscore variables and combine into engagement index}
survey_raw$vote_scale <- c(scale(survey_raw$vote_dummy))
survey_raw$vpname_scale <- c(scale(survey_raw$vpname_dummy))
survey_raw$news_scale <- c(scale(survey_raw$tot_news))

survey_raw <- survey_raw %>% mutate(engage_zscale = vote_scale + vpname_scale + news_scale)
```

## SES Index

```{r dummy variables for homeowner and for not seeking employment}
survey_raw <- survey_raw %>% mutate(emp_dummy = ifelse(Q20 != 2, 1, 0), homeowner_dummy = ifelse(Q22 < 3, 1, 0))
```

```{r recode asset variables for true NAs and zeros}
#For the assets variables, NAs are zeroes unless otherwise noted in Q23A11. To account for this, we first convert NAs to zeroes, then restore true NAs using Q23A11. 

survey_raw <- survey_raw %>% mutate(across(Q23A1:Q23A10, ~na_convert(.)))
survey_raw <- survey_raw %>% mutate(across(Q23A1:Q23A10, ~ifelse(!is.na(Q23A11), NA, .)))
```

```{r impute assets for 3 missing based on location}
assetimpute <- survey_raw %>% filter(is.na(Q23A11)) %>% group_by(Q3, Q2) %>% summarise(across(Q23A1:Q23A10, mean, .names = "{.col}_impute"))

survey_raw <- left_join(survey_raw, assetimpute, by = c("Q3", "Q2"))

survey_raw <- survey_raw %>% mutate(
Q23A1_plus = ifelse(!is.na(Q23A11), Q23A1_impute, Q23A1), 
Q23A2_plus = ifelse(!is.na(Q23A11), Q23A2_impute, Q23A2), 
Q23A3_plus = ifelse(!is.na(Q23A11), Q23A3_impute, Q23A3), 
Q23A4_plus = ifelse(!is.na(Q23A11), Q23A4_impute, Q23A4), 
Q23A5_plus = ifelse(!is.na(Q23A11), Q23A5_impute, Q23A5), 
Q23A6_plus = ifelse(!is.na(Q23A11), Q23A6_impute, Q23A6), 
Q23A7_plus = ifelse(!is.na(Q23A11), Q23A7_impute, Q23A7), 
Q23A8_plus = ifelse(!is.na(Q23A11), Q23A8_impute, Q23A8), 
Q23A9_plus = ifelse(!is.na(Q23A11), Q23A9_impute, Q23A9), 
Q23A10_plus = ifelse(!is.na(Q23A11), Q23A10_impute, Q23A10))

assetnames <- c("homeowner_dummy", "Q23A1_plus", "Q23A2_plus", "Q23A3_plus", "Q23A4_plus", "Q23A5_plus", "Q23A6_plus", "Q23A7_plus", "Q23A8_plus", "Q23A9_plus", "Q23A10_plus")
```

```{r create PCA index of assets}
survey_raw$asset_index <- prcomp(survey_raw[,assetnames], scale = T)$x[,"PC1"]
```

```{r impute missing strata based on location}
strataimpute <- survey_raw %>% filter(Q16 != 7) %>% group_by(Q3, Q2) %>% summarise(strataimputeval = mean(Q16)) # here no strata is coded as 7
survey_raw <- left_join(survey_raw, strataimpute, by = c("Q3", "Q2"))
survey_raw <- survey_raw %>% mutate(strata_plus = ifelse(Q16 == 7, strataimputeval, Q16))
```

```{r create zscore index of employment, assets, and strata}
survey_raw$emp_scale <- c(scale(survey_raw$emp_dummy))
survey_raw$strata_scale <- c(scale(survey_raw$strata_plus))
survey_raw$asset_scale <- c(scale(survey_raw$asset_index))

survey_raw <- survey_raw %>% mutate(ses_zscale = emp_scale + strata_scale + asset_scale)
```

#clean other covariates
```{r}
survey_raw <- survey_raw %>% mutate(urbandummy = ifelse(Q2 == 1, 1, 0)) #recode urban variable as binary
survey_raw <- survey_raw %>% mutate(education_level = ifelse(Q19 == 10, 0, Q19)) #10 means they answered none when asked how many years of edu
survey_raw <- survey_raw %>% mutate(female = ifelse(Q14 == 2, 1, 0)) #recode sex variable as binary
survey_raw <- survey_raw %>% mutate(eth_minority = ifelse(Q17 == 6, 0, 1)) #binary variable for belonging to an ethnic minority
survey_raw <- survey_raw %>% mutate(relig_minority = ifelse(Q18 == 1, 0, 1)) #binary variable for belonging to a religious minority

survey_raw <- survey_raw %>% mutate(victimized = case_when(
Q53 == 1 ~ 1, 
Q53 == 2 ~ 0, 
Q53 == 3 ~ NA_real_, 
))

survey_raw <- survey_raw %>% mutate(age_cat = Q15, area_cat = Q2, sex_cat = Q14) #rename key variables
```

#clean treatment variable

##treatment indicator
```{r recode treatment indicator}
survey_raw <- survey_raw %>% mutate(treatment = case_when(
Q10 == 1 ~ "C", #control group
Q10 == 2 ~ "T1", #postconflict violence treatment
Q10 == 3 ~ "T2A", #government culpability treatment
Q10 == 4 ~ "T2B", #rebel culpability treatment
))
```

## clean outcome variables

```{r}
#each outcome is coded in two columns (i.e., a response is coded in only one of the two columns). I combine these into a single column for ease of analysis, also noting that missing or nonresponses are coded as 5. For the ELN and dissident outcomes, I also recode such that they are in the same direction as the accords outcome (1 is low support and 4 is high support)

#peace accord outcome

survey_raw <- survey_raw %>% mutate(
accords_out = case_when(
is.na(Q36) ~ Q42, 
is.na(Q42) ~ Q36
))

survey_raw <- survey_raw %>% mutate(accords_out = ifelse(accords_out == 5, NA, accords_out)) # here 5 means did not respond and is NA

#ELN outcome

survey_raw <- survey_raw %>% mutate(
eln_out = case_when(
is.na(Q37) ~ Q40, 
is.na(Q40) ~ Q37
))

survey_raw <- survey_raw %>% mutate(eln_out = ifelse(eln_out == 5, NA, eln_out)) # here 5 means did not respond and is NA

survey_raw <- survey_raw %>% mutate(eln_out_invert = case_when(
  eln_out == 1 ~ 4,
  eln_out == 2 ~ 3,
  eln_out == 3 ~ 2,
  eln_out == 4 ~ 1,
  TRUE ~ NA_real_
  )) # changing scale order from 4-1 to 1-4

#dissident outcome

survey_raw <- survey_raw %>% mutate(
dissident_out = case_when(
is.na(Q38) ~ Q41, 
is.na(Q41) ~ Q38
))

survey_raw <- survey_raw %>% mutate(dissident_out = ifelse(dissident_out == 5, NA, dissident_out)) # here 5 means did not respond and is NA

survey_raw <- survey_raw %>% mutate(dissident_out_invert = case_when(
  dissident_out == 1 ~ 4,
  dissident_out == 2 ~ 3,
  dissident_out == 3 ~ 2,
  dissident_out == 4 ~ 1,
  TRUE ~ NA_real_
  )) # changing scale order from 4-1 to 1-4

```

#zscale outcomes relative to control and create outcome index
```{r zscale outcome measures and combine into outcome index}

survey_raw$eln_scale <- (survey_raw$eln_out_invert - mean(survey_raw$eln_out_invert[survey_raw$treatment == "C"], na.rm = TRUE)) / sd(survey_raw$eln_out_invert[survey_raw$treatment == "C"], na.rm = TRUE)
  
survey_raw$dissident_scale <- (survey_raw$dissident_out_invert - mean(survey_raw$dissident_out_invert[survey_raw$treatment == "C"], na.rm = TRUE)) / sd(survey_raw$dissident_out_invert[survey_raw$treatment == "C"], na.rm = TRUE)

survey_raw$accords_scale <- (survey_raw$accords_out - mean(survey_raw$accords_out[survey_raw$treatment == "C"], na.rm = TRUE)) / sd(survey_raw$accords_out[survey_raw$treatment == "C"], na.rm = TRUE)


survey_raw <- survey_raw %>% mutate(outcomes_zscale = eln_scale + dissident_scale + accords_scale)
```


#write final data file
```{r}
write.csv(survey_raw, file = "survey_clean.csv")
```


